# Pandas and Pyarrow

In [1]:
import random
import string
import numpy as np
import pandas as pd
from datetime import datetime
import pathlib
import pyarrow as pa
import pyarrow.csv 
%load_ext memory_profiler

Create a large dataset

In [2]:
%%time
def gen_random_string(length:int=32) -> str:
    """Return a random string of ``length`` characters.

    Characters are drawn with replacement from the uppercase ASCII letters
    and the decimal digits, using the standard ``random`` module.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)
    
def make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
    """Build a synthetic time-series DataFrame with mixed column types.

    One row is generated for every timestamp produced by
    ``pd.date_range(start=start, end=end, freq=freq)``, so ``end`` is included
    when it falls on the frequency grid.

    Parameters
    ----------
    start, end : str or datetime-like
        Bounds handed to ``pd.date_range``.
    freq : str
        Pandas frequency alias giving the spacing between consecutive rows.
    seed : int, optional
        When given, seeds both NumPy's global generator and the standard
        ``random`` module so every column of the frame is reproducible.
        When ``None`` the global generators are left untouched.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (timestamps), ``cat`` (one of ``cat1``..``cat5``),
        ``str1``/``str2`` (strings from ``gen_random_string``), ``a``/``b``
        (floats from ``np.random.rand``) and ``c`` (integers from
        ``np.random.randint(1, 100)``), on the default integer index.
    """
    dt = pd.date_range(start=start, end=end, freq=freq, name="timestamp")
    n = len(dt)

    if seed is not None:
        # Seed by *calling* the function. Assigning to `np.random.seed`
        # seeds nothing and replaces the function with an int for the rest
        # of the session.
        np.random.seed(seed)
        # gen_random_string draws from the stdlib `random` module, which keeps
        # its own state independent of NumPy's.
        random.seed(seed)

    columns = {
        'date': dt,
        'cat': np.random.choice(['cat1','cat2','cat3','cat4','cat5'],n),
        'str1':[gen_random_string() for _ in range(n)],
        'str2':[gen_random_string() for _ in range(n)],
        'a': np.random.rand(n),
        'b': np.random.rand(n),
        'c': np.random.randint(1,100,n),
    }

    # The frame keeps the default integer index, so a comparison of its last
    # label with `end` can never match; no trailing row is dropped and the
    # final timestamp stays in the result.
    return pd.DataFrame(columns)

# Minute-resolution rows from 2020-01-01 to 2023-12-31; a fixed seed is passed
# with the intent of making the random columns repeatable between runs.
df = make_timeseries(start=datetime(2020,1,1), end=datetime(2023,12,31), freq='1min', seed=10)

CPU times: user 9.11 s, sys: 268 ms, total: 9.38 s
Wall time: 9.4 s


Print the first rows to see what the data looks like.

In [3]:
df.head()

Unnamed: 0,date,cat,str1,str2,a,b,c
0,2020-01-01 00:00:00,cat2,73CM59C2JRCYKU97QRD0E4YA50T14RQG,T2BTG6BSWUV6WTJ2VRE37NDKZN5SEHK9,0.068295,0.508177,16
1,2020-01-01 00:01:00,cat5,62MEURYEVBEZA6BAZ8L1O9QJLTN1RP0B,ENE2OO4VP0XKP95GMI8DDBZ2EPYLB183,0.232802,0.514414,16
2,2020-01-01 00:02:00,cat5,NPYJU967P9W1Q59DKM2NALOT4KSMEGBC,BKAPXFGA95R5WBCVGZJIPN9T4L9RZAHJ,0.615268,0.229465,5
3,2020-01-01 00:03:00,cat3,WKDM4RTW70TIDJTZ1U8I1T2E6E481GKK,AF5YKLV99L14CLPOJBQ7RY25HS1OQSSB,0.059176,0.511924,4
4,2020-01-01 00:04:00,cat1,VH5SNZMQ4CF2A9EFUA25D2XU4XX504FW,SNUEJJ1ZMW20LQAFS1JTXBBQV6XQZ9LU,0.381258,0.968041,28


Print the shape of the dataframe

In [4]:
df.shape

(2102401, 7)

Print memory usage of the dataframe 

In [5]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2102401 entries, 0 to 2102400
Data columns (total 7 columns):
 #   Column  Dtype         
---  ------  -----         
 0   date    datetime64[ns]
 1   cat     object        
 2   str1    object        
 3   str2    object        
 4   a       float64       
 5   b       float64       
 6   c       int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 495.2 MB


## Write and read the dataframe in CSV format 

First, we will time writing and reading the dataframe in CSV format with pandas alone, to get a baseline against which the performance of pyarrow can be compared.

Write the dataframe in CSV format

In [6]:
%%time
# TODO

CPU times: user 3 μs, sys: 0 ns, total: 3 μs
Wall time: 4.77 μs


In [7]:
%%time
df.to_csv("pandas.csv",index=False)

CPU times: user 6.54 s, sys: 133 ms, total: 6.68 s
Wall time: 6.68 s


To check the size of the file on disk, you can use the `stat()` method of `pathlib.Path`.

In [8]:
# TODO

In [9]:
pathlib.Path("pandas.csv").stat().st_size / 1024 /1024

265.55980587005615

Write dataframe in CSV format with compression

In [10]:
%%time
# TODO

CPU times: user 3 μs, sys: 1 μs, total: 4 μs
Wall time: 4.77 μs


In [11]:
%%time
df.to_csv("pandas.csv.gz",index=False,compression="gzip")

CPU times: user 24.3 s, sys: 127 ms, total: 24.5 s
Wall time: 24.6 s


Check the file size

In [12]:
# TODO

In [13]:
pathlib.Path("pandas.csv.gz").stat().st_size / 1024 /1024

149.00873851776123

Now, measure the performance of reading the dataframe back from CSV format.

First, try to read the CSV file

In [14]:
%%time
# TODO

CPU times: user 3 μs, sys: 0 ns, total: 3 μs
Wall time: 4.53 μs


In [15]:
%%time
df1 = pd.read_csv("pandas.csv")

CPU times: user 2.52 s, sys: 196 ms, total: 2.72 s
Wall time: 2.72 s


Then, try to read the compressed CSV file

In [16]:
%%time
# TODO

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 2.38 μs


In [17]:
%%time
df1 = pd.read_csv("pandas.csv.gz")

CPU times: user 3.72 s, sys: 188 ms, total: 3.91 s
Wall time: 3.91 s


## Use pyarrow

We will now time writing and reading the same dataframe in CSV format using pyarrow.

First, copy the dataframe.

In [18]:
df_pa = df.copy()

Convert the data to pyarrow format

In [19]:
df_pa_table = pa.Table.from_pandas(df_pa)

In [20]:
df_pa_table

pyarrow.Table
date: timestamp[ns]
cat: string
str1: string
str2: string
a: double
b: double
c: int64
----
date: [[2020-01-01 00:00:00.000000000,2020-01-01 00:01:00.000000000,2020-01-01 00:02:00.000000000,2020-01-01 00:03:00.000000000,2020-01-01 00:04:00.000000000,...,2023-12-30 23:56:00.000000000,2023-12-30 23:57:00.000000000,2023-12-30 23:58:00.000000000,2023-12-30 23:59:00.000000000,2023-12-31 00:00:00.000000000]]
cat: [["cat2","cat5","cat5","cat3","cat1",...,"cat1","cat5","cat4","cat3","cat4"]]
str1: [["73CM59C2JRCYKU97QRD0E4YA50T14RQG","62MEURYEVBEZA6BAZ8L1O9QJLTN1RP0B","NPYJU967P9W1Q59DKM2NALOT4KSMEGBC","WKDM4RTW70TIDJTZ1U8I1T2E6E481GKK","VH5SNZMQ4CF2A9EFUA25D2XU4XX504FW",...,"Z3A1KWLYZOS6DOI1FYV3LP2S11WTMRHG","9G07GNRCNIMWSYPKOTBJ8UVPMYQM05FE","EKEFQ0LIGP0NTRYWL4FT174V2O3H6EHU","Z72VUJANELOE4IQBSUSKZMQ3D5ZSQZ9N","N8ZEK7UHYD2KTKYWGFF966WEU3LVLVQV"]]
str2: [["T2BTG6BSWUV6WTJ2VRE37NDKZN5SEHK9","ENE2OO4VP0XKP95GMI8DDBZ2EPYLB183","BKAPXFGA95R5WBCVGZJIPN9T4L9RZAHJ","AF5YKLV99L14CLPOJBQ

Write the dataframe with the `pyarrow.csv.write_csv` method

In [21]:
%%time
# TODO

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 4.77 μs


In [22]:
%%time
pyarrow.csv.write_csv(df_pa_table, "pyarrow.csv")

CPU times: user 462 ms, sys: 144 ms, total: 606 ms
Wall time: 605 ms


Check the file size

In [23]:
# TODO

In [24]:
pathlib.Path("pyarrow.csv").stat().st_size / 1024 /1024

297.6403512954712

Write the dataframe with compression

In [25]:
%%time
# Wrap the destination path in a gzip-compressing output stream and have
# write_csv write the table into that stream; leaving the `with` block closes
# the stream.
with pa.CompressedOutputStream("pyarrow.csv.gz","gzip") as out:
    pyarrow.csv.write_csv(df_pa_table,out)

CPU times: user 10.3 s, sys: 91 ms, total: 10.4 s
Wall time: 10.4 s


Check the file size

In [26]:
# TODO

In [27]:
pathlib.Path("pyarrow.csv.gz").stat().st_size / 1024 /1024

152.28399658203125

Read the dataframe. You can use the parameter `engine="pyarrow"` in `pandas.read_csv`.

In [28]:
%%time
# TODO

CPU times: user 1 μs, sys: 0 ns, total: 1 μs
Wall time: 1.91 μs


In [29]:
%%time 
df_pa_1 = pd.read_csv("pyarrow.csv",engine="pyarrow")

CPU times: user 1.59 s, sys: 713 ms, total: 2.3 s
Wall time: 958 ms


In [30]:
df_pa_1.head()

Unnamed: 0,date,cat,str1,str2,a,b,c
0,2020-01-01 00:00:00,cat2,73CM59C2JRCYKU97QRD0E4YA50T14RQG,T2BTG6BSWUV6WTJ2VRE37NDKZN5SEHK9,0.068295,0.508177,16
1,2020-01-01 00:01:00,cat5,62MEURYEVBEZA6BAZ8L1O9QJLTN1RP0B,ENE2OO4VP0XKP95GMI8DDBZ2EPYLB183,0.232802,0.514414,16
2,2020-01-01 00:02:00,cat5,NPYJU967P9W1Q59DKM2NALOT4KSMEGBC,BKAPXFGA95R5WBCVGZJIPN9T4L9RZAHJ,0.615268,0.229465,5
3,2020-01-01 00:03:00,cat3,WKDM4RTW70TIDJTZ1U8I1T2E6E481GKK,AF5YKLV99L14CLPOJBQ7RY25HS1OQSSB,0.059176,0.511924,4
4,2020-01-01 00:04:00,cat1,VH5SNZMQ4CF2A9EFUA25D2XU4XX504FW,SNUEJJ1ZMW20LQAFS1JTXBBQV6XQZ9LU,0.381258,0.968041,28


In [31]:
%%time 
df_pa_1 = pd.read_csv("pyarrow.csv.gz",engine="pyarrow")

CPU times: user 2.66 s, sys: 445 ms, total: 3.1 s
Wall time: 2.04 s


In [32]:
df_pa_1.head()

Unnamed: 0,date,cat,str1,str2,a,b,c
0,2020-01-01 00:00:00,cat2,73CM59C2JRCYKU97QRD0E4YA50T14RQG,T2BTG6BSWUV6WTJ2VRE37NDKZN5SEHK9,0.068295,0.508177,16
1,2020-01-01 00:01:00,cat5,62MEURYEVBEZA6BAZ8L1O9QJLTN1RP0B,ENE2OO4VP0XKP95GMI8DDBZ2EPYLB183,0.232802,0.514414,16
2,2020-01-01 00:02:00,cat5,NPYJU967P9W1Q59DKM2NALOT4KSMEGBC,BKAPXFGA95R5WBCVGZJIPN9T4L9RZAHJ,0.615268,0.229465,5
3,2020-01-01 00:03:00,cat3,WKDM4RTW70TIDJTZ1U8I1T2E6E481GKK,AF5YKLV99L14CLPOJBQ7RY25HS1OQSSB,0.059176,0.511924,4
4,2020-01-01 00:04:00,cat1,VH5SNZMQ4CF2A9EFUA25D2XU4XX504FW,SNUEJJ1ZMW20LQAFS1JTXBBQV6XQZ9LU,0.381258,0.968041,28


In [33]:
df_pa_1.memory_usage(deep=True)

Index          132
date      16819208
cat      111427253
str1     170294481
str2     170294481
a         16819208
b         16819208
c         16819208
dtype: int64

## Convert to pyarrow types

It is also possible to use PyArrow data types directly in pandas: PyArrow data structure integration is implemented through pandas’ ExtensionArray interface.
Using PyArrow data types makes it possible to accelerate processing with PyArrow compute functions where available.

In [34]:
df_pa = df.copy()

In [35]:
df_pa.columns

Index(['date', 'cat', 'str1', 'str2', 'a', 'b', 'c'], dtype='object')

Try to convert the column types to PyArrow types.

In [36]:
# TODO

In [37]:
# Target Arrow-backed dtype for each numeric column; every column is converted
# and assigned back onto the same frame.
arrow_dtypes = {
    "a": "float64[pyarrow]",
    "b": "float64[pyarrow]",
    "c": "int16[pyarrow]",
}
for column, dtype in arrow_dtypes.items():
    df_pa[column] = df_pa[column].astype(dtype)

Check the types of the columns

In [38]:
df_pa.dtypes

date     datetime64[ns]
cat              object
str1             object
str2             object
a       double[pyarrow]
b       double[pyarrow]
c        int16[pyarrow]
dtype: object

In [39]:
df_pa.memory_usage(deep=True)

Index          132
date      16819208
cat      111427253
str1     170294481
str2     170294481
a         16819208
b         16819208
c          4204802
dtype: int64