In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Load the page-view CSV; the path is relative to the notebook's working
# directory.
fn = './pageviews.csv'
df = pd.read_csv(filepath_or_buffer=fn)
# Bare last expression so Jupyter renders the frame as the cell output.
df

Unnamed: 0,user_id,url,channel,received_at
0,8519,/products/90,organic_search,1742318201
1,6961,/products/733,organic_search,1742318201
2,8376,/products/382,referral,1742318201
3,8483,/products/750,organic_search,1742318201
4,3241,/products/133,social,1742318201
...,...,...,...,...
99995,725,/products/95,display,1742318201
99996,1062,/products/100,organic_search,1742318201
99997,5476,/products/37,display,1742318201
99998,8742,/products/655,social,1742318201


In [5]:
# Convert the pandas frame into an Arrow table. `preserve_index=None` is
# pyarrow's default, spelled out here to make the index handling explicit.
table = pa.Table.from_pandas(df, preserve_index=None)
# Bare last expression: shows the Arrow schema and a preview of each column.
table

pyarrow.Table
user_id: int64
url: string
channel: string
received_at: int64
----
user_id: [[8519,6961,8376,8483,3241,...,725,1062,5476,8742,5721]]
url: [["/products/90","/products/733","/products/382","/products/750","/products/133",...,"/products/95","/products/100","/products/37","/products/655","/products/237"]]
channel: [["organic_search","organic_search","referral","organic_search","social",...,"display","organic_search","display","social","display"]]
received_at: [[1742318201,1742318201,1742318201,1742318201,1742318201,...,1742318201,1742318201,1742318201,1742318201,1742318201]]

In [6]:
# Row count of the Arrow table (the same value `len(table)` reports).
table.num_rows

100000

In [8]:
# Persist the Arrow table as a single Parquet file in the working directory.
# (`pq` is imported together with the other dependencies in the first cell.)
pq.write_table(table, 'pageviews.parquet')

In [9]:
# Read the file back and check that the Parquet round trip is lossless:
# `Table.equals` compares schema and column data (schema metadata is ignored
# by default).
table2 = pq.read_table('pageviews.parquet')
assert table2.equals(table), "Parquet round trip changed the table contents"

In [11]:
# Column projection: request only the two listed columns from the file
# instead of loading every column.
projected_table = pq.read_table(
    'pageviews.parquet',
    columns=['user_id', 'channel'],
)
projected_table

pyarrow.Table
user_id: int64
channel: string
----
user_id: [[8519,6961,8376,8483,3241,...,725,1062,5476,8742,5721]]
channel: [["organic_search","organic_search","referral","organic_search","social",...,"display","organic_search","display","social","display"]]

In [12]:
# Open the Parquet file and print its file-level metadata. `parquet_file` is
# kept as a notebook variable because the next cell reads row-group metadata
# from it.
parquet_file = pq.ParquetFile('pageviews.parquet')
print(
    f"Number of row groups: {parquet_file.num_row_groups}",
    "\nFile metadata:",
    parquet_file.metadata,
    sep="\n",
)

Number of row groups: 1

File metadata:
<pyarrow._parquet.FileMetaData object at 0xffff47be9760>
  created_by: parquet-cpp-arrow version 19.0.1
  num_columns: 4
  num_rows: 100000
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 2830


In [13]:
# Walk every row group in the file and print its metadata summary.
file_metadata = parquet_file.metadata
for group_index in range(file_metadata.num_row_groups):
    print(f"\nRow group {group_index} metadata:")
    print(file_metadata.row_group(group_index))


Row group 0 metadata:
<pyarrow._parquet.RowGroupMetaData object at 0xffff44942f70>
  num_columns: 4
  num_rows: 100000
  total_byte_size: 435398
  sorting_columns: ()


In [14]:
# Write a directory-partitioned dataset under `pageviews_ds`, split by the
# values of the `channel` column.
#
# By default each call writes files with freshly generated names and leaves
# existing files in place, so re-running this cell would append another full
# copy of the data and every later read would return duplicated rows.
# 'delete_matching' clears each partition directory the first time it is
# written to, which makes the cell safe to re-run.
pq.write_to_dataset(
    table,
    root_path='pageviews_ds',
    partition_cols=['channel'],
    existing_data_behavior='delete_matching',
)

In [15]:
# Read the whole partitioned dataset back by pointing at its root directory.
# As the printed schema shows, the partition column `channel` comes back as
# the last field and is dictionary-typed.
dataset = pq.read_table(source='pageviews_ds')
print(dataset)

pyarrow.Table
user_id: int64
url: string
received_at: int64
channel: dictionary<values=string, indices=int32, ordered=0>
----
user_id: [[7307,164,9396,1266,5369,...,6776,9405,1394,4782,6730],[2485,1828,6425,3141,2040,...,4514,9256,3733,2380,5992],...,[6899,4167,9593,2043,1729,...,7547,1089,9164,8783,813],[6304,4872,473,3692,8204,...,5701,8199,3267,8556,8742]]
url: [["/products/87","/products/190","/products/722","/products/390","/products/159",...,"/products/355","/products/50","/products/335","/products/529","/products/786"],["/products/924","/products/860","/products/545","/products/747","/products/971",...,"/products/896","/products/381","/products/889","/products/307","/products/466"],...,["/products/183","/products/710","/products/318","/products/281","/products/308",...,"/products/583","/products/830","/products/333","/products/773","/products/831"],["/products/16","/products/838","/products/61","/products/56","/products/927",...,"/products/500","/products/427","/products/209","/

In [18]:
# Keep only the rows whose `channel` value equals 'social'. The filter is a
# list of (column, operator, value) tuples handed to the reader.
filters = [('channel', '=', 'social')]
dataset = pq.read_table(source='pageviews_ds', filters=filters)
print(dataset)

pyarrow.Table
user_id: int64
url: string
received_at: int64
channel: dictionary<values=string, indices=int32, ordered=0>
----
user_id: [[3241,6881,621,8199,794,...,9266,1794,9673,4592,3769],[8274,8051,8770,7099,6426,...,7341,4027,7813,9406,7495],...,[6899,4167,9593,2043,1729,...,7547,1089,9164,8783,813],[6304,4872,473,3692,8204,...,5701,8199,3267,8556,8742]]
url: [["/products/133","/products/995","/products/63","/products/856","/products/104",...,"/products/323","/products/939","/products/712","/products/775","/products/333"],["/products/150","/products/712","/products/516","/products/851","/products/641",...,"/products/981","/products/426","/products/236","/products/458","/products/3"],...,["/products/183","/products/710","/products/318","/products/281","/products/308",...,"/products/583","/products/830","/products/333","/products/773","/products/831"],["/products/16","/products/838","/products/61","/products/56","/products/927",...,"/products/500","/products/427","/products/209","/pr