Pandas - Read CSV

Dataset Download URL -> https://excelbianalytics.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/

In [1]:
%%time
import pandas as pandas_pd
pandas_df = pandas_pd.read_csv('sales/sales.csv')

CPU times: total: 9 s
Wall time: 9.64 s


Pandas - Write CSV as parquet

In [2]:
%%time
pandas_df.to_parquet('sales/pandas/sales.parquet')

CPU times: total: 6.88 s
Wall time: 6.9 s


Pyarrow - Read CSV

In [3]:
from pyarrow import csv, parquet
from datetime import datetime

In [4]:
%%time
pyarrow_df = csv.read_csv('sales/sales.csv')


CPU times: total: 9.91 s
Wall time: 1.09 s


Pyarrow - Write parquet file

In [5]:
%%time
parquet.write_table(pyarrow_df, 'sales/pyarrow/sales.parquet')

CPU times: total: 4.59 s
Wall time: 4.6 s


Pyarrow - Read parquet file

In [6]:
%%time
import pandas as pd
pd.read_parquet('sales/pyarrow/sales.parquet', engine='pyarrow')

CPU times: total: 6.41 s
Wall time: 2.58 s


Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
0,Australia and Oceania,Palau,Office Supplies,Online,H,3/6/2016,517073523,3/26/2016,2401,651.21,524.96,1563555.21,1260428.96,303126.25
1,Europe,Poland,Beverages,Online,L,4/18/2010,380507028,5/26/2010,9340,47.45,31.79,443183.00,296918.60,146264.40
2,North America,Canada,Cereal,Online,M,1/8/2015,504055583,1/31/2015,103,205.70,117.11,21187.10,12062.33,9124.77
3,Europe,Belarus,Snacks,Online,C,1/19/2014,954955518,2/27/2014,1414,152.58,97.44,215748.12,137780.16,77967.96
4,Middle East and North Africa,Oman,Cereal,Offline,H,4/26/2019,970755660,6/2/2019,7027,205.70,117.11,1445453.90,822931.97,622521.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,Middle East and North Africa,Iraq,Household,Online,L,3/17/2014,940436398,4/23/2014,4884,668.27,502.54,3263830.68,2454405.36,809425.32
4999996,Europe,Monaco,Meat,Offline,H,11/8/2015,407689177,11/28/2015,3142,421.89,364.69,1325578.38,1145855.98,179722.40
4999997,Australia and Oceania,Solomon Islands,Clothes,Online,C,6/1/2020,727000367,7/18/2020,4419,109.28,35.84,482908.32,158376.96,324531.36
4999998,Australia and Oceania,Marshall Islands,Cosmetics,Offline,L,2/12/2020,714043796,3/22/2020,282,437.20,263.33,123290.40,74259.06,49031.34


fastparquet - Read CSV  (Compression)

In [7]:
import pandas as fastpartquet_pd
from fastparquet import write, ParquetFile

In [8]:
%%time
fastpartquet_df = fastpartquet_pd.read_csv('sales/sales.csv')

CPU times: total: 8.64 s
Wall time: 8.67 s


fastparquet - Write to parquet with GZIP compression

In [9]:
%%time
fastpartquet_df.to_parquet("sales/fp_compress/sales.parquet", compression="GZIP")

CPU times: total: 13.2 s
Wall time: 13.2 s


fastparquet - Read compressed parquet file

In [10]:
%%time
import pandas as pd
pd.read_parquet('sales/fp_compress/sales.parquet', engine='fastparquet')

CPU times: total: 1.58 s
Wall time: 1.6 s


Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
0,Australia and Oceania,Palau,Office Supplies,Online,H,3/6/2016,517073523,3/26/2016,2401,651.21,524.96,1563555.21,1260428.96,303126.25
1,Europe,Poland,Beverages,Online,L,4/18/2010,380507028,5/26/2010,9340,47.45,31.79,443183.00,296918.60,146264.40
2,North America,Canada,Cereal,Online,M,1/8/2015,504055583,1/31/2015,103,205.70,117.11,21187.10,12062.33,9124.77
3,Europe,Belarus,Snacks,Online,C,1/19/2014,954955518,2/27/2014,1414,152.58,97.44,215748.12,137780.16,77967.96
4,Middle East and North Africa,Oman,Cereal,Offline,H,4/26/2019,970755660,6/2/2019,7027,205.70,117.11,1445453.90,822931.97,622521.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,Middle East and North Africa,Iraq,Household,Online,L,3/17/2014,940436398,4/23/2014,4884,668.27,502.54,3263830.68,2454405.36,809425.32
4999996,Europe,Monaco,Meat,Offline,H,11/8/2015,407689177,11/28/2015,3142,421.89,364.69,1325578.38,1145855.98,179722.40
4999997,Australia and Oceania,Solomon Islands,Clothes,Online,C,6/1/2020,727000367,7/18/2020,4419,109.28,35.84,482908.32,158376.96,324531.36
4999998,Australia and Oceania,Marshall Islands,Cosmetics,Offline,L,2/12/2020,714043796,3/22/2020,282,437.20,263.33,123290.40,74259.06,49031.34


fastparquet - Read CSV (No Compression)

In [11]:
import pandas as fastpartquet_pd
from fastparquet import write, ParquetFile

In [12]:
%%time
fastpartquet_df = fastpartquet_pd.read_csv('sales/sales.csv')

CPU times: total: 8.69 s
Wall time: 8.77 s


fastparquet - write parquet (no compression)

In [13]:
%%time
fastpartquet_df.to_parquet("sales/fp_nocompress/sales.parquet")

CPU times: total: 7.02 s
Wall time: 6.94 s


fastparquet - read parquet

In [14]:
%%time
import pandas as pd
pd.read_parquet('sales/fp_nocompress/sales.parquet', engine='fastparquet')

CPU times: total: 1.34 s
Wall time: 1.34 s


Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
0,Australia and Oceania,Palau,Office Supplies,Online,H,3/6/2016,517073523,3/26/2016,2401,651.21,524.96,1563555.21,1260428.96,303126.25
1,Europe,Poland,Beverages,Online,L,4/18/2010,380507028,5/26/2010,9340,47.45,31.79,443183.00,296918.60,146264.40
2,North America,Canada,Cereal,Online,M,1/8/2015,504055583,1/31/2015,103,205.70,117.11,21187.10,12062.33,9124.77
3,Europe,Belarus,Snacks,Online,C,1/19/2014,954955518,2/27/2014,1414,152.58,97.44,215748.12,137780.16,77967.96
4,Middle East and North Africa,Oman,Cereal,Offline,H,4/26/2019,970755660,6/2/2019,7027,205.70,117.11,1445453.90,822931.97,622521.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,Middle East and North Africa,Iraq,Household,Online,L,3/17/2014,940436398,4/23/2014,4884,668.27,502.54,3263830.68,2454405.36,809425.32
4999996,Europe,Monaco,Meat,Offline,H,11/8/2015,407689177,11/28/2015,3142,421.89,364.69,1325578.38,1145855.98,179722.40
4999997,Australia and Oceania,Solomon Islands,Clothes,Online,C,6/1/2020,727000367,7/18/2020,4419,109.28,35.84,482908.32,158376.96,324531.36
4999998,Australia and Oceania,Marshall Islands,Cosmetics,Offline,L,2/12/2020,714043796,3/22/2020,282,437.20,263.33,123290.40,74259.06,49031.34
