In [49]:
import tempfile
import pathlib
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np

import pyarrow.compute as pc
from pyarrow import csv

import os
import polars as pl
import math

In [4]:
def get_size(path, unit='MB'):
    """Print the on-disk size of ``path`` in the requested unit.

    Parameters
    ----------
    path : path-like
        File whose size is looked up with ``os.path.getsize``.
    unit : str, default 'MB'
        ``'MB'`` divides the byte count by 1024 * 1024, ``'GB'`` by
        1024 * 1024 * 1024; any other value prints the raw byte count.

    Returns
    -------
    None
        The size is printed, not returned.
    """
    n_bytes = os.path.getsize(path)

    # Build the message first so there is a single print call at the end.
    if unit == 'MB':
        message = f'Size: {n_bytes / (1024 * 1024)} MB'
    elif unit == 'GB':
        message = f'Size: {n_bytes / (1024 * 1024 * 1024)} GB'
    else:
        message = f'Size: {n_bytes} B'
    print(message)

# Point `path` at the playlist Parquet file and print its size
# (get_size defaults to megabytes).
path = './data/test/playlist_2010to2022.parquet'

get_size(path=path)

Size: 0.2697105407714844 MB


In [18]:
# Rebind `path` to the audio-features CSV and print its size in MB.
# NOTE: the cells below read the file through this same `path` name.
path = 'data/test2/audio_features.csv'
get_size(path, 'MB')

Size: 458.2281322479248 MB


In [20]:
# Load the file at `path` into a polars DataFrame and preview the first rows.
df = pl.read_csv(path)
df.head()

isrc,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,updated_on
str,f64,f64,i64,f64,f64,i64,f64,f64,i64,f64,i64,i64,f64,str
"""AD4X65752184""",0.906,0.65,296733,0.35,0.775,3,0.114,-11.777,1,0.0284,110,4,0.336,"""2023-08-24 09:…"
"""AEA0D1991170""",0.00095,0.621,191989,0.639,0.82,7,0.506,-9.258,1,0.0775,140,4,0.346,"""2023-08-24 09:…"
"""AEA0Q2004008""",0.0312,0.692,282904,0.98,0.866,9,0.0969,-9.952,1,0.0881,140,4,0.225,"""2023-08-24 09:…"
"""AEA0Q2004009""",0.000558,0.516,197904,0.924,0.434,5,0.153,-13.564,0,0.22,148,4,0.204,"""2023-08-24 09:…"
"""AEA0Q2004010""",0.00938,0.587,199471,0.91,0.804,0,0.563,-10.552,1,0.0596,136,4,0.321,"""2023-08-24 09:…"


In [21]:
# Read the file at `path` with pyarrow's CSV reader; the bare name on the
# last line displays the resulting table's schema and a preview of its data.
table = csv.read_csv(path)
table

pyarrow.Table
isrc: string
acousticness: double
danceability: double
duration_ms: int64
energy: double
instrumentalness: double
key: int64
liveness: double
loudness: double
mode: int64
speechiness: double
tempo: int64
time_signature: int64
valence: double
updated_on: timestamp[s]
----
isrc: [["AD4X65752184","AEA0D1991170","AEA0Q2004008","AEA0Q2004009","AEA0Q2004010",...,"DEAR41525750","DEAR41525761","DEAR41525763","DEAR41526078","DEAR41526200"],["DEAR41526214","DEAR41526249","DEAR41526295","DEAR41526357","DEAR41526411",...,"DEGD91107503","DEGD91107505","DEGD91107507","DEGD91107508","DEGD91107509"],...,["CH6541664408","CH6542352743","NLRD52200994","ITZB42136724","NLHR22100856",...,"DEAR41870698","US8FK1100058","QZ5FN1825826","GBX6A2200154","GBV8E1549001"],["DECH60912164","NLHR22200278","DEAR42267680","AEA2M2348807","CA5KR1510187",...,"DEY472375687","DEFR71400907","GB7NR1743402","DEZN82304108","DEY471812426"]]
acousticness: [[0.906,0.00095,0.0312,0.000558,0.00938,...,0.0098,0.14,0.341,0.

In [None]:
# Select the `isrc` column of the Arrow table.
table['isrc']

In [15]:
# Number of rows in the Arrow table.
table.num_rows

4687104

In [17]:
# Preview a two-row slice of the table (no offset given, so it starts at the first row).
table.slice(length=2)

pyarrow.Table
isrc: string
acousticness: double
danceability: double
duration_ms: int64
energy: double
instrumentalness: double
key: int64
liveness: double
loudness: double
mode: int64
speechiness: double
tempo: int64
time_signature: int64
valence: double
updated_on: timestamp[s]
----
isrc: [["AD4X65752184","AEA0D1991170"]]
acousticness: [[0.906,0.00095]]
danceability: [[0.65,0.621]]
duration_ms: [[296733,191989]]
energy: [[0.35,0.639]]
instrumentalness: [[0.775,0.82]]
key: [[3,7]]
liveness: [[0.114,0.506]]
loudness: [[-11.777,-9.258]]
mode: [[1,1]]
...

In [28]:
# Persist the Arrow table as a Parquet file in the same directory as the source CSV.
# NOTE: the in-memory table shows `updated_on` as timestamp[s], while batches read
# back from this file show timestamp[ms].
pq.write_table(table=table, where='./data/test2/audio_features.parquet')

In [40]:
# Rebind `path` to the Parquet file produced by the pq.write_table cell;
# the following cells open the file through this name.
path = './data/test2/audio_features.parquet'

In [56]:
# Open the Parquet file at `path` and take the total row count from its metadata.
pf = pq.ParquetFile(path)
nrows = pf.metadata.num_rows

# Split the file into partitions of roughly 50 MB of on-disk size each.
# max(1, ...) keeps at least one partition so an empty (0-byte) file cannot
# lead to a division by zero below.
n_partitions = max(1, math.ceil(os.path.getsize(path) / (1024 ** 2) / 50))

# iter_batches documents batch_size as an int ("maximum number of records to
# yield per batch"), so use an integer instead of the float produced by true
# division. Rounding up means n_partitions batches are enough to cover every
# row; max(1, ...) avoids requesting a zero-sized batch.
batch_size = max(1, math.ceil(nrows / n_partitions))

# Pull only the first batch from the streaming reader.
first_n_rows = next(pf.iter_batches(batch_size=batch_size))

In [58]:
# Display the first record batch returned by pf.iter_batches.
first_n_rows

pyarrow.RecordBatch
isrc: string
acousticness: double
danceability: double
duration_ms: int64
energy: double
instrumentalness: double
key: int64
liveness: double
loudness: double
mode: int64
speechiness: double
tempo: int64
time_signature: int64
valence: double
updated_on: timestamp[ms]
----
isrc: ["AD4X65752184","AEA0D1991170","AEA0Q2004008","AEA0Q2004009","AEA0Q2004010","AEA0Q2004011","AEA0Q2004012","AEA0Q2004013","AEA0Q2004014","AEA0Q2004015",...,"RUA491403963","RUA491403964","RUA491403965","RUA491403966","RUA491403967","RUA491403968","RUA491403976","RUA491404101","RUA491404103","RUA491404144"]
acousticness: [0.906,0.00095,0.0312,0.000558,0.00938,0.00854,0.000251,0.0053,0.0179,0.00125,...,0.655,0.0346,0.00011,0.00298,0.117,0.197,0.0613,0.00265,0.000435,0.0000514]
danceability: [0.65,0.621,0.692,0.516,0.587,0.656,0.552,0.713,0.52,0.628,...,0.53,0.727,0.585,0.795,0.634,0.693,0.886,0.85,0.939,0.835]
duration_ms: [296733,191989,282904,197904,199471,266736,292779,192784,396851,163995,...

In [36]:
# Row count as recorded in the Parquet file metadata.
pf.metadata.num_rows

4687104

In [66]:
# Display the file-level Parquet metadata (column, row, and row-group counts, format version).
pf.metadata

<pyarrow._parquet.FileMetaData object at 0x11423bec0>
  created_by: parquet-cpp-arrow version 13.0.0
  num_columns: 15
  num_rows: 4687104
  num_row_groups: 5
  format_version: 2.6
  serialized_size: 10131

In [31]:
# Print the docstring of iter_batches to check its parameters.
# Exploratory lookup; can be dropped from a finished notebook.
help(pf.iter_batches)

Help on method iter_batches in module pyarrow.parquet.core:

iter_batches(batch_size=65536, row_groups=None, columns=None, use_threads=True, use_pandas_metadata=False) method of pyarrow.parquet.core.ParquetFile instance
    Read streaming batches from a Parquet file.
    
    Parameters
    ----------
    batch_size : int, default 64K
        Maximum number of records to yield per batch. Batches may be
        smaller if there aren't enough rows in the file.
    row_groups : list
        Only these row groups will be read from the file.
    columns : list
        If not None, only these columns will be read from the file. A
        column name may be a prefix of a nested field, e.g. 'a' will select
        'a.b', 'a.c', and 'a.d.e'.
    use_threads : boolean, default True
        Perform multi-threaded column reads.
    use_pandas_metadata : boolean, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded.
    
    Yie

In [60]:
# Read the playlist CSV with polars, passing batch_size=2.
# NOTE(review): the displayed result contains more than two rows, so batch_size
# does not cap the number of rows returned — presumably it only tunes how the
# parser buffers lines. If the goal was "first two rows", confirm against the
# polars docs and consider a row-limit option instead.
pl.read_csv('data/test/playlist_2010to2022.csv', batch_size=2)

playlist_url,year,track_id,track_name,track_popularity,album,artist_id,artist_name,artist_genres,artist_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
str,i64,str,str,i64,str,str,str,str,i64,f64,f64,i64,f64,i64,f64,f64,f64,f64,f64,f64,i64,i64
"""https://open.s…",2000,"""3AJwUDP919kvQ9…","""Yellow""",91,"""Parachutes""","""4gzpq5DPGxSnKT…","""Coldplay""","""['permanent wa…",86,0.429,0.661,11,-7.227,1,0.0281,0.00239,0.000121,0.234,0.285,173.372,266773,4
"""https://open.s…",2000,"""2m1hi0nfMR9vdG…","""All The Small …",84,"""Enema Of The S…","""6FBDaR13swtiWw…","""blink-182""","""['alternative …",75,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726,167067,4
"""https://open.s…",2000,"""3y4LxiYMgDl4Re…","""Breathe""",69,"""Breathe""","""25NQNriVT2YbSW…","""Faith Hill""","""['contemporary…",61,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859,250547,4
"""https://open.s…",2000,"""60a0Rd6pjrkxjP…","""In the End""",88,"""Hybrid Theory …","""6XyY86QOPPrYVG…","""Linkin Park""","""['alternative …",83,0.556,0.864,3,-5.87,0,0.0584,0.00958,0.0,0.209,0.4,105.143,216880,4
"""https://open.s…",2000,"""62bOmKYxYg7dhr…","""Bye Bye Bye""",74,"""No Strings Att…","""6Ff53KvcvAj5U7…","""*NSYNC""","""['boy band', '…",65,0.61,0.926,8,-4.843,0,0.0479,0.031,0.0012,0.0821,0.861,172.638,200400,4
"""https://open.s…",2000,"""5Mmk2ii6laakqf…","""Thong Song""",73,"""Unleash The Dr…","""6x9QLdzo6eBZxJ…","""Sisqo""","""['contemporary…",56,0.706,0.888,2,-6.959,1,0.0654,0.119,0.0000964,0.07,0.714,121.549,253733,4
"""https://open.s…",2000,"""3yfqSUWxFvZELE…","""The Real Slim …",88,"""The Marshall M…","""7dGJo4pcD2V6oG…","""Eminem""","""['detroit hip …",88,0.949,0.661,5,-4.244,0,0.0572,0.0302,0.0,0.0454,0.76,104.504,284200,4
"""https://open.s…",2000,"""7oQSevUCbYs4Qa…","""Rock DJ""",57,"""Sing When You'…","""2HcwFjNelS49kF…","""Robbie William…","""['dance rock',…",69,0.712,0.762,7,-4.307,1,0.0326,0.026,0.0,0.0981,0.842,103.032,260560,4
"""https://open.s…",2000,"""7H6ev70Weq6Ddp…","""Say My Name""",80,"""The Writing's …","""1Y8cdNmUJH7yBT…","""Destiny's Chil…","""['dance pop', …",69,0.713,0.678,5,-3.525,0,0.102,0.273,0.0,0.149,0.734,138.009,271333,4
"""https://open.s…",2000,"""64BbK9SFKH2jk8…","""Otherside""",83,"""Californicatio…","""0L8ExT028jH3dd…","""Red Hot Chili …","""['alternative …",80,0.458,0.795,0,-3.265,1,0.0574,0.00316,0.000202,0.0756,0.513,123.229,255373,4


In [61]:
from subprocess import check_output

def wc(filename):
    """Return the number of newline characters in ``filename``.

    This yields the same count as ``wc -l`` (a final line without a trailing
    newline is not counted), but is computed in-process. That removes the
    dependency on an external ``wc`` executable and avoids a file name that
    starts with ``-`` being parsed as a command-line option.

    Parameters
    ----------
    filename : str, bytes or path-like
        File to scan.

    Returns
    -------
    int
        Count of ``\\n`` bytes in the file.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    newline_count = 0
    with open(filename, 'rb') as fh:
        # Read fixed-size binary chunks so a large file is never held in
        # memory all at once; the b'' sentinel marks end of file.
        for chunk in iter(lambda: fh.read(1024 * 1024), b''):
            newline_count += chunk.count(b'\n')
    return newline_count