In [1]:
import tempfile
import pathlib
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np

import pyarrow.compute as pc
from pyarrow import csv

import os
import polars as pl
import math

In [49]:
def get_size(path, unit='MB'):
    size = os.path.getsize(path)
    if unit == 'MB':
        print(f'Size: {size / (1024 * 1024)} MB')
    elif unit == 'GB':
        print(f'Size: {size / (1024 * 1024 * 1024)} GB')
    else:
        print(f'Size: {size} B')

path = './data/test/playlist_2010to2022.parquet'

get_size(path=path)

Size: 0.2697105407714844 MB


In [50]:
path = 'data/test2/audio_features.csv'
get_size(path, 'MB')

Size: 458.2281322479248 MB


In [51]:
df = pl.read_csv(path)
df.head()

isrc,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,updated_on
str,f64,f64,i64,f64,f64,i64,f64,f64,i64,f64,i64,i64,f64,str
"""AD4X65752184""",0.906,0.65,296733,0.35,0.775,3,0.114,-11.777,1,0.0284,110,4,0.336,"""2023-08-24 09:…"
"""AEA0D1991170""",0.00095,0.621,191989,0.639,0.82,7,0.506,-9.258,1,0.0775,140,4,0.346,"""2023-08-24 09:…"
"""AEA0Q2004008""",0.0312,0.692,282904,0.98,0.866,9,0.0969,-9.952,1,0.0881,140,4,0.225,"""2023-08-24 09:…"
"""AEA0Q2004009""",0.000558,0.516,197904,0.924,0.434,5,0.153,-13.564,0,0.22,148,4,0.204,"""2023-08-24 09:…"
"""AEA0Q2004010""",0.00938,0.587,199471,0.91,0.804,0,0.563,-10.552,1,0.0596,136,4,0.321,"""2023-08-24 09:…"


In [52]:
table = csv.read_csv(path)
table

pyarrow.Table
isrc: string
acousticness: double
danceability: double
duration_ms: int64
energy: double
instrumentalness: double
key: int64
liveness: double
loudness: double
mode: int64
speechiness: double
tempo: int64
time_signature: int64
valence: double
updated_on: timestamp[s]
----
isrc: [["AD4X65752184","AEA0D1991170","AEA0Q2004008","AEA0Q2004009","AEA0Q2004010",...,"DEAR41525750","DEAR41525761","DEAR41525763","DEAR41526078","DEAR41526200"],["DEAR41526214","DEAR41526249","DEAR41526295","DEAR41526357","DEAR41526411",...,"DEGD91107503","DEGD91107505","DEGD91107507","DEGD91107508","DEGD91107509"],...,["CH6541664408","CH6542352743","NLRD52200994","ITZB42136724","NLHR22100856",...,"DEAR41870698","US8FK1100058","QZ5FN1825826","GBX6A2200154","GBV8E1549001"],["DECH60912164","NLHR22200278","DEAR42267680","AEA2M2348807","CA5KR1510187",...,"DEY472375687","DEFR71400907","GB7NR1743402","DEZN82304108","DEY471812426"]]
acousticness: [[0.906,0.00095,0.0312,0.000558,0.00938,...,0.0098,0.14,0.341,0.

In [53]:
table['isrc']

<pyarrow.lib.ChunkedArray object at 0x7f410c1c74c0>
[
  [
    "AD4X65752184",
    "AEA0D1991170",
    "AEA0Q2004008",
    "AEA0Q2004009",
    "AEA0Q2004010",
    ...
    "DEAR41525750",
    "DEAR41525761",
    "DEAR41525763",
    "DEAR41526078",
    "DEAR41526200"
  ],
  [
    "DEAR41526214",
    "DEAR41526249",
    "DEAR41526295",
    "DEAR41526357",
    "DEAR41526411",
    ...
    "DEGD91107503",
    "DEGD91107505",
    "DEGD91107507",
    "DEGD91107508",
    "DEGD91107509"
  ],
...,
  [
    "CH6541664408",
    "CH6542352743",
    "NLRD52200994",
    "ITZB42136724",
    "NLHR22100856",
    ...
    "DEAR41870698",
    "US8FK1100058",
    "QZ5FN1825826",
    "GBX6A2200154",
    "GBV8E1549001"
  ],
  [
    "DECH60912164",
    "NLHR22200278",
    "DEAR42267680",
    "AEA2M2348807",
    "CA5KR1510187",
    ...
    "DEY472375687",
    "DEFR71400907",
    "GB7NR1743402",
    "DEZN82304108",
    "DEY471812426"
  ]
]

In [54]:
table.num_rows

4687104

In [55]:
table.slice(length=2)

pyarrow.Table
isrc: string
acousticness: double
danceability: double
duration_ms: int64
energy: double
instrumentalness: double
key: int64
liveness: double
loudness: double
mode: int64
speechiness: double
tempo: int64
time_signature: int64
valence: double
updated_on: timestamp[s]
----
isrc: [["AD4X65752184","AEA0D1991170"]]
acousticness: [[0.906,0.00095]]
danceability: [[0.65,0.621]]
duration_ms: [[296733,191989]]
energy: [[0.35,0.639]]
instrumentalness: [[0.775,0.82]]
key: [[3,7]]
liveness: [[0.114,0.506]]
loudness: [[-11.777,-9.258]]
mode: [[1,1]]
...

In [56]:
pq.write_table(table=table, where='./data/test2/audio_features.parquet')

In [57]:
path = './data/test2/audio_features.parquet'

In [58]:
pf = pq.ParquetFile(path)
nrows = pf.metadata.num_rows
n_partitions = math.ceil(os.path.getsize(path) / (1024 ** 2) / 50)
batch_size = nrows / n_partitions
first_n_rows = next(pf.iter_batches(batch_size = batch_size))

In [59]:
first_n_rows

pyarrow.RecordBatch
isrc: string
acousticness: double
danceability: double
duration_ms: int64
energy: double
instrumentalness: double
key: int64
liveness: double
loudness: double
mode: int64
speechiness: double
tempo: int64
time_signature: int64
valence: double
updated_on: timestamp[ms]
----
isrc: ["AD4X65752184","AEA0D1991170","AEA0Q2004008","AEA0Q2004009","AEA0Q2004010","AEA0Q2004011","AEA0Q2004012","AEA0Q2004013","AEA0Q2004014","AEA0Q2004015",...,"RUA491403963","RUA491403964","RUA491403965","RUA491403966","RUA491403967","RUA491403968","RUA491403976","RUA491404101","RUA491404103","RUA491404144"]
acousticness: [0.906,0.00095,0.0312,0.000558,0.00938,0.00854,0.000251,0.0053,0.0179,0.00125,...,0.655,0.0346,0.00011,0.00298,0.117,0.197,0.0613,0.00265,0.000435,0.0000514]
danceability: [0.65,0.621,0.692,0.516,0.587,0.656,0.552,0.713,0.52,0.628,...,0.53,0.727,0.585,0.795,0.634,0.693,0.886,0.85,0.939,0.835]
duration_ms: [296733,191989,282904,197904,199471,266736,292779,192784,396851,163995,...

In [60]:
pf.metadata.num_rows

4687104

In [61]:
pf.metadata

<pyarrow._parquet.FileMetaData object at 0x7f410c1c6660>
  created_by: parquet-cpp-arrow version 13.0.0
  num_columns: 15
  num_rows: 4687104
  num_row_groups: 5
  format_version: 2.6
  serialized_size: 10131

In [31]:
help(pf.iter_batches)

Help on method iter_batches in module pyarrow.parquet.core:

iter_batches(batch_size=65536, row_groups=None, columns=None, use_threads=True, use_pandas_metadata=False) method of pyarrow.parquet.core.ParquetFile instance
    Read streaming batches from a Parquet file.
    
    Parameters
    ----------
    batch_size : int, default 64K
        Maximum number of records to yield per batch. Batches may be
        smaller if there aren't enough rows in the file.
    row_groups : list
        Only these row groups will be read from the file.
    columns : list
        If not None, only these columns will be read from the file. A
        column name may be a prefix of a nested field, e.g. 'a' will select
        'a.b', 'a.c', and 'a.d.e'.
    use_threads : boolean, default True
        Perform multi-threaded column reads.
    use_pandas_metadata : boolean, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded.
    
    Yie

Get metadata associated with a field

In [31]:
pq.read_table('./data/test2/audio_features.parquet').schema.field('energy').metadata

### Using `yield` in iterator function to avoid returning entire dataframe

In [38]:
## setup
import polars as pl
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from pathlib import Path
import datetime


DATA_PATH = Path('/home/flemm0/school_stuff/USC_Fall_2023/DSCI551-Final_Project/data/')
TEST_DB_PATH = Path(DATA_PATH / 'test')
TEMP_DB_PATH = Path(DATA_PATH / 'temp')


## step 1: select
def read_table(table_name):
    dataset = ds.dataset(TEST_DB_PATH / table_name, format='parquet')
    for partition in dataset.files:
        partition = Path(partition)
        data = pq.read_table(partition)
        yield data, partition.stem


step = 0
query_id = 'query_' + datetime.datetime.now().strftime("%y%m%d_%H%M%S") 
query_step_dir = query_id + '_' + str(step)

curr_query_path = Path(TEMP_DB_PATH / query_step_dir)
if not curr_query_path.exists():
    Path.mkdir(curr_query_path)

for partition, name in read_table('audio_features'):
    where = (curr_query_path / name).with_suffix('.parquet')
    pq.write_table(table=partition, where=(curr_query_path / name).with_suffix('.parquet'))


## step 2: where
def filter(prev_query_path, filters):
    dataset = ds.dataset(prev_query_path, format='parquet')
    for partition in dataset.files:
        partition = Path(partition)
        data = pq.read_table(partition, filters=filters) # list of tuples e.g. ('acousticness', '<', 1)
        yield data, partition.stem

prev_query_path = curr_query_path
step += 1
query_step_dir = query_id + '_' + str(step)
curr_query_path = Path(TEMP_DB_PATH / query_step_dir)
if not curr_query_path.exists():
    Path.mkdir(curr_query_path)
    for partition, name in filter(prev_query_path=prev_query_path, filters=[('acousticness', '<', 1)]):
        pq.write_table(table=partition, where=(curr_query_path / name).with_suffix('.parquet'))

In [42]:
## test to make sure filter is correct

dataset = ds.dataset(curr_query_path)
for f in dataset.files:
    data = pq.read_table(f, filters=[('acousticness', '>', 1)])
    print(pl.DataFrame._from_arrow(data))

## works!!

shape: (0, 15)
┌──────┬──────────────┬─────────────┬─────────────┬───┬───────┬─────────────┬─────────┬────────────┐
│ isrc ┆ acousticness ┆ danceabilit ┆ duration_ms ┆ … ┆ tempo ┆ time_signat ┆ valence ┆ updated_on │
│ ---  ┆ ---          ┆ y           ┆ ---         ┆   ┆ ---   ┆ ure         ┆ ---     ┆ ---        │
│ str  ┆ f64          ┆ ---         ┆ i64         ┆   ┆ i64   ┆ ---         ┆ f64     ┆ str        │
│      ┆              ┆ f64         ┆             ┆   ┆       ┆ i64         ┆         ┆            │
╞══════╪══════════════╪═════════════╪═════════════╪═══╪═══════╪═════════════╪═════════╪════════════╡
└──────┴──────────────┴─────────────┴─────────────┴───┴───────┴─────────────┴─────────┴────────────┘
shape: (0, 15)
┌──────┬──────────────┬─────────────┬─────────────┬───┬───────┬─────────────┬─────────┬────────────┐
│ isrc ┆ acousticness ┆ danceabilit ┆ duration_ms ┆ … ┆ tempo ┆ time_signat ┆ valence ┆ updated_on │
│ ---  ┆ ---          ┆ y           ┆ ---         ┆   ┆ ---  

`pyarrow` tabular datasets

In [74]:
import pyarrow.dataset as ds
import pathlib
import sys

dataset = ds.dataset(source=['data/test2/audio_features_0.parquet', 'data/test2/audio_features_1.parquet'], format='parquet')
batches = dataset.to_batches()


# print(f'dataset object size in main memory: {sys.getsizeof(dataset) / (2 ** 20)} MB')
# print(f'batches object size in main memory: {sys.getsizeof(batches) / (2 ** 20)} MB')
# print(f'first batch size in main memory: {sys.getsizeof(next(batches)) / (2 ** 20)} MB')

def select_all_from_table(dataset):
    batches = dataset.to_batches()
    truncated_dataset = dataset.head(100)
    head = pl.DataFrame._from_arrow(truncated_dataset)

    pl.Config.set_tbl_hide_dataframe_shape(True)
    print(f'shape: ({dataset.count_rows()}, {df.shape[1]})')
    print(head)
    #pl.Config.set_tbl_hide_dataframe_shape(False)



In [75]:
select_all_from_table(dataset)

shape: (1874842, 15)
shape: (100, 15)
┌────────────┬────────────┬────────────┬────────────┬───┬───────┬────────────┬─────────┬───────────┐
│ isrc       ┆ acousticne ┆ danceabili ┆ duration_m ┆ … ┆ tempo ┆ time_signa ┆ valence ┆ updated_o │
│ ---        ┆ ss         ┆ ty         ┆ s          ┆   ┆ ---   ┆ ture       ┆ ---     ┆ n         │
│ str        ┆ ---        ┆ ---        ┆ ---        ┆   ┆ i64   ┆ ---        ┆ f64     ┆ ---       │
│            ┆ f64        ┆ f64        ┆ i64        ┆   ┆       ┆ i64        ┆         ┆ str       │
╞════════════╪════════════╪════════════╪════════════╪═══╪═══════╪════════════╪═════════╪═══════════╡
│ AD4X657521 ┆ 0.906      ┆ 0.65       ┆ 296733     ┆ … ┆ 110   ┆ 4          ┆ 0.336   ┆ 2023-08-2 │
│ 84         ┆            ┆            ┆            ┆   ┆       ┆            ┆         ┆ 4         │
│            ┆            ┆            ┆            ┆   ┆       ┆            ┆         ┆ 09:27:00  │
│ AEA0D19911 ┆ 0.00095    ┆ 0.621      ┆ 191989     ┆

In [1]:
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pathlib
import polars as pl

base = pathlib.Path('/home/flemm0/school_stuff/USC_Fall_2023/DSCI551-Final_Project/data/test/')

dataset = ds.dataset(base / 'audio_features', format='parquet')

dataset.files

['/home/flemm0/school_stuff/USC_Fall_2023/DSCI551-Final_Project/data/test/audio_features/audio_features_0.parquet',
 '/home/flemm0/school_stuff/USC_Fall_2023/DSCI551-Final_Project/data/test/audio_features/audio_features_1.parquet',
 '/home/flemm0/school_stuff/USC_Fall_2023/DSCI551-Final_Project/data/test/audio_features/audio_features_2.parquet',
 '/home/flemm0/school_stuff/USC_Fall_2023/DSCI551-Final_Project/data/test/audio_features/audio_features_3.parquet',
 '/home/flemm0/school_stuff/USC_Fall_2023/DSCI551-Final_Project/data/test/audio_features/audio_features_4.parquet']

In [2]:
batches = dataset.to_batches()

nxt = next(batches)

pl.DataFrame._from_arrow(nxt).rows()

[('AD4X65752184',
  0.906,
  0.65,
  296733,
  0.35,
  0.775,
  3,
  0.114,
  -11.777,
  1,
  0.0284,
  110,
  4,
  0.336,
  '2023-08-24 09:27:00'),
 ('AEA0D1991170',
  0.00095,
  0.621,
  191989,
  0.639,
  0.82,
  7,
  0.506,
  -9.258,
  1,
  0.0775,
  140,
  4,
  0.346,
  '2023-08-24 09:30:04'),
 ('AEA0Q2004008',
  0.0312,
  0.692,
  282904,
  0.98,
  0.866,
  9,
  0.0969,
  -9.952,
  1,
  0.0881,
  140,
  4,
  0.225,
  '2023-08-24 09:30:04'),
 ('AEA0Q2004009',
  0.000558,
  0.516,
  197904,
  0.924,
  0.434,
  5,
  0.153,
  -13.564,
  0,
  0.22,
  148,
  4,
  0.204,
  '2023-08-24 09:30:04'),
 ('AEA0Q2004010',
  0.00938,
  0.587,
  199471,
  0.91,
  0.804,
  0,
  0.563,
  -10.552,
  1,
  0.0596,
  136,
  4,
  0.321,
  '2023-08-24 09:30:04'),
 ('AEA0Q2004011',
  0.00854,
  0.656,
  266736,
  0.834,
  0.865,
  6,
  0.0614,
  -12.853,
  0,
  0.0507,
  142,
  4,
  0.685,
  '2023-08-24 09:30:04'),
 ('AEA0Q2004012',
  0.000251,
  0.552,
  292779,
  0.96,
  0.792,
  5,
  0.102,
  -10.45,
 

In [55]:
pq.ParquetFile(dataset.files[0]).metadata.num_row_groups

3

## Filtering

In [12]:
for partition in dataset.files:
    data = pq.read_table(partition, filters=[('acousticness', '<', 1)])
    data = pl.DataFrame._from_arrow(data)
    print(data)

shape: (2, 15)
┌────────────┬────────────┬────────────┬────────────┬───┬───────┬────────────┬─────────┬───────────┐
│ isrc       ┆ acousticne ┆ danceabili ┆ duration_m ┆ … ┆ tempo ┆ time_signa ┆ valence ┆ updated_o │
│ ---        ┆ ss         ┆ ty         ┆ s          ┆   ┆ ---   ┆ ture       ┆ ---     ┆ n         │
│ str        ┆ ---        ┆ ---        ┆ ---        ┆   ┆ i64   ┆ ---        ┆ f64     ┆ ---       │
│            ┆ f64        ┆ f64        ┆ i64        ┆   ┆       ┆ i64        ┆         ┆ str       │
╞════════════╪════════════╪════════════╪════════════╪═══╪═══════╪════════════╪═════════╪═══════════╡
│ AD4X657521 ┆ 0.906      ┆ 0.65       ┆ 296733     ┆ … ┆ 110   ┆ 4          ┆ 0.336   ┆ 2023-08-2 │
│ 84         ┆            ┆            ┆            ┆   ┆       ┆            ┆         ┆ 4         │
│            ┆            ┆            ┆            ┆   ┆       ┆            ┆         ┆ 09:27:00  │
│ AEA0D19911 ┆ 0.00095    ┆ 0.621      ┆ 191989     ┆ … ┆ 140   ┆ 4         

In [14]:
import sys

sys.getsizeof(dataset)

88

## Hash Join

In [13]:
import polars as pl
from collections import defaultdict
import sys
from pprint import pprint

def hash_join(table1: pl.DataFrame, index1, table2: pl.DataFrame, index2):
    new_headers = []
    for c in table1.columns:
        if c in table2.columns:
            new_headers.append(c + '_l')
        else:
            new_headers.append(c)
    for c in table2.columns:
        if c in table1.columns:
            new_headers.append(c + '_r')
        else:
            new_headers.append(c)

    table1, table2 = table1.rows(), table2.rows()
    h = defaultdict(list)
    # hash phase
    for s in table1:
        h[s[index1]].append(s)
    # join phase
    pprint(h)
    res = [(s + r) for r in table2 for s in h[r[index2]]]

    return pl.DataFrame._from_records(res, schema=new_headers)
    

df1 = pl.DataFrame({
    'age': [27, 18, 28, 18, 28],
    'name': ["Jonah", "Alan", "Glory", "Popeye", "Alan"]
    })

df2 = pl.DataFrame({
    'name': ["Jonah", "Jonah", "Alan", "Alan", "Glory"],
    'word': ['Whales', 'Spiders', 'Ghosts', 'Zombies', 'Buffy']
})

hash_join(df1, 1, df2, 0)

defaultdict(<class 'list'>,
            {'Alan': [(18, 'Alan'), (28, 'Alan')],
             'Glory': [(28, 'Glory')],
             'Jonah': [(27, 'Jonah')],
             'Popeye': [(18, 'Popeye')]})


age,name_l,name_r,word
i64,str,str,str
27,"""Jonah""","""Jonah""","""Whales"""
27,"""Jonah""","""Jonah""","""Spiders"""
18,"""Alan""","""Alan""","""Ghosts"""
28,"""Alan""","""Alan""","""Ghosts"""
18,"""Alan""","""Alan""","""Zombies"""
28,"""Alan""","""Alan""","""Zombies"""
28,"""Glory""","""Glory""","""Buffy"""


In [None]:
def hash_join_with_partitions(table1: pyarrow.dataset, index1, table2: pyarrow.dataset, index2):
    '''implement hash join that accepts table partitions
    
    the hash phase should wrap a for loop above `for s in table1` for all the partitions and store the join values in the hash
    '''

    hash_table = defaultdict(list)
    result = []
    # hash phase
    for batch in table1.to_batches():
        rows = pl.DataFrame._from_arrow(batch).rows()
        for row in rows:
            hash_table[row[index1]].append(row)

    # join phase
    for batch in table2.to_batches():
        rows = pl.DataFrame._from_arrow(batch).rows()
        for row in rows:
            for entry in hash_table[row[index2]]:
                result.append(entry + row)