# Reading and Writing Parquet Files
Source: https://arrow.apache.org/docs/python/parquet.html

In [3]:
import shutil

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [4]:
# Download the NOAA hourly-precipitation sample CSV over HTTPS, load it into a
# DataFrame, and preview the first 30 rows.
# NOTE(review): later cells select columns 'age' and 'job' and partition on
# 'job', none of which appear in this file's header -- the saved outputs look
# like they came from a different dataset loaded in a since-removed cell.
# Confirm which CSV is intended before relying on Restart & Run All.
df = pd.read_csv(
    'https://www1.ncdc.noaa.gov/pub/data/cdo/samples/PRECIP_HLY_sample_csv.csv'
)
df.head(n=30)

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,HPCP,Measurement Flag,Quality Flag
0,COOP:310301,ASHEVILLE NC US,682.1,35.5954,-82.5568,20100101 00:00,99999,],
1,COOP:310301,ASHEVILLE NC US,682.1,35.5954,-82.5568,20100101 01:00,0,g,
2,COOP:310301,ASHEVILLE NC US,682.1,35.5954,-82.5568,20100102 06:00,1,,


In [6]:
# Build an Arrow Table from the DataFrame, then serialise it to a single
# Parquet file in the working directory.
table = pa.Table.from_pandas(df)
pq.write_table(table, where='example.parquet')

In [10]:
# Round-trip: load the Parquet file back into an Arrow Table and preview its
# first five rows as a pandas DataFrame.
table2 = pq.read_table('example.parquet')
table2.to_pandas().head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [11]:
# Column projection: only 'age' and 'job' are requested from the file.
(
    pq.read_table('example.parquet', columns=['age', 'job'])
    .to_pandas()
    .head()
)

Unnamed: 0,age,job
0,30,unemployed
1,33,services
2,35,management
3,30,management
4,59,blue-collar


In [12]:
# Same column projection through read_pandas, intended for files that were
# written from a pandas DataFrame (it also reads the extra pandas metadata).
(
    pq.read_pandas('example.parquet', columns=['age', 'job'])
    .to_pandas()
    .head()
)

Unnamed: 0,age,job
0,30,unemployed
1,33,services
2,35,management
3,30,management
4,59,blue-collar


In [14]:
# Open the file as a ParquetFile for inspection and display its Parquet schema.
# `parquet_file` is reused by the metadata cell that follows.
parquet_file = pq.ParquetFile('example.parquet')
parquet_file.schema

<pyarrow._parquet.ParquetSchema object at 0x0000028AA282B7F0>
age: INT64
job: BYTE_ARRAY UTF8
marital: BYTE_ARRAY UTF8
education: BYTE_ARRAY UTF8
default: BYTE_ARRAY UTF8
balance: INT64
housing: BYTE_ARRAY UTF8
loan: BYTE_ARRAY UTF8
contact: BYTE_ARRAY UTF8
day: INT64
month: BYTE_ARRAY UTF8
duration: INT64
campaign: INT64
pdays: INT64
previous: INT64
poutcome: BYTE_ARRAY UTF8
y: BYTE_ARRAY UTF8
__index_level_0__: INT64
 

In [15]:
# Display the file-level metadata (writer, column/row counts, row groups, format version).
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x0000028AA2484F98>
  created_by: parquet-cpp version 1.4.1-SNAPSHOT
  num_columns: 18
  num_rows: 4521
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 3912

In [17]:
# Write the table as a partitioned dataset under 'my_parq', split on the 'job' column.
# write_to_dataset gives every output file a fresh unique name, so re-running
# this cell would add a second copy of the data next to the first. Removing any
# previous output first keeps the cell idempotent under Restart & Run All.
shutil.rmtree('my_parq', ignore_errors=True)
pq.write_to_dataset(table, root_path='my_parq', partition_cols=['job'])

In [18]:
# Load the whole partitioned dataset by pointing read_table at its root
# directory, then preview the first five rows as a pandas DataFrame.
table3 = pq.read_table('my_parq')
table3.to_pandas().head(n=5)

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,job
11,43,married,secondary,no,264,yes,no,cellular,17,apr,113,2,-1,0,unknown,no,admin
17,37,single,tertiary,no,2317,yes,no,cellular,20,apr,114,1,152,2,failure,no,admin
29,53,married,secondary,no,105,no,yes,cellular,21,aug,74,2,-1,0,unknown,no,admin
35,42,divorced,secondary,no,1811,yes,no,unknown,14,may,150,1,-1,0,unknown,no,admin
49,61,married,unknown,no,4629,yes,no,cellular,27,jan,181,1,92,1,success,yes,admin


In [20]:
# Read with multiple threads.
# The old `nthreads=` keyword was deprecated and later removed from pyarrow;
# `use_threads=True` is its replacement. The number of worker threads comes
# from Arrow's global CPU thread pool (call pa.set_cpu_count(4) beforehand if
# exactly four threads are required).
pq.read_table('my_parq', use_threads=True).to_pandas().head()

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,job
11,43,married,secondary,no,264,yes,no,cellular,17,apr,113,2,-1,0,unknown,no,admin
17,37,single,tertiary,no,2317,yes,no,cellular,20,apr,114,1,152,2,failure,no,admin
29,53,married,secondary,no,105,no,yes,cellular,21,aug,74,2,-1,0,unknown,no,admin
35,42,divorced,secondary,no,1811,yes,no,unknown,14,may,150,1,-1,0,unknown,no,admin
49,61,married,unknown,no,4629,yes,no,cellular,27,jan,181,1,92,1,success,yes,admin
