In [3]:
import pandas as pd
import numpy as np
!pip3 install pyarrow


Collecting pyarrow
  Downloading pyarrow-10.0.1-cp39-cp39-macosx_10_14_x86_64.whl (25.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.1/25.1 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Installing collected packages: pyarrow
Successfully installed pyarrow-10.0.1


### Create the dataset

In [4]:
def get_dataset(size, seed=None):
    """Build a synthetic DataFrame with ``size`` rows of random data.

    Columns, in order:
        size -- one of 'big', 'medium', 'small'
        age  -- integer drawn from [1, 50)
        team -- one of 'red', 'blue', 'yellow', 'green'
        win  -- 'yes' or 'no'
        date -- a day between 2020-01-01 and 2022-12-31 (inclusive)
        prob -- float drawn uniformly from [0, 1)

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, all values are drawn from a private
        ``np.random.RandomState(seed)``, so the same seed always yields the
        same frame. When omitted, NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame
        The generated frame; no dtype optimisation is applied here.
    """
    # RandomState exposes the same choice/randint/uniform API as the
    # module-level functions, so both sources can be used interchangeably.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Create Fake Dataset (draw order is kept stable so seeded output is stable)
    df = pd.DataFrame()
    df['size'] = rng.choice(['big', 'medium', 'small'], size)
    df['age'] = rng.randint(1, 50, size)
    df['team'] = rng.choice(['red', 'blue', 'yellow', 'green'], size)
    df['win'] = rng.choice(['yes', 'no'], size)
    dates = pd.date_range('2020-01-01', '2022-12-31')
    df['date'] = rng.choice(dates, size)
    df['prob'] = rng.uniform(0, 1, size)
    return df

def set_dtypes(df):
    """Convert the columns of ``df`` to compact dtypes.

    ``size`` and ``team`` become ``category``, ``age`` becomes ``int16``,
    ``prob`` becomes ``float32`` and the 'yes'/'no' labels in ``win`` are
    mapped to ``True``/``False``.

    The frame is modified in place and also returned, so both
    ``set_dtypes(df)`` and ``df = set_dtypes(df)`` work. Calling the function
    again on an already converted frame leaves it unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``size``, ``team``, ``age``, ``win``, ``prob``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df['size'] = df['size'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int16')
    # Map only while the column still holds the 'yes'/'no' labels: mapping an
    # already-boolean column through this dict would turn every value into NaN
    # (e.g. when the cell is re-run on the same frame).
    if not pd.api.types.is_bool_dtype(df['win']):
        df['win'] = df['win'].map({'yes': True, 'no': False})
    df['prob'] = df['prob'].astype('float32')
    return df
print('Reading and writing CSV')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_csv('test.csv')
%time df_csv = pd.read_csv('test.csv')

print('Reading and writing Pickle')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_pickle('test.pickle')
%time df_pickle = pd.read_pickle('test.pickle')

print('Reading and writing Parquet')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_parquet('test.parquet')
%time df_parquet = pd.read_parquet('test.parquet')

print('Reading and writing Feather')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_feather('test.feather')
%time df_feather = pd.read_feather('test.feather')

Reading and writing CSV
CPU times: user 13.6 s, sys: 236 ms, total: 13.9 s
Wall time: 13.9 s
CPU times: user 2.03 s, sys: 191 ms, total: 2.22 s
Wall time: 2.27 s
Reading and writing Pickle
CPU times: user 2.12 ms, sys: 11.6 ms, total: 13.7 ms
Wall time: 17.6 ms
CPU times: user 1.09 ms, sys: 18.4 ms, total: 19.5 ms
Wall time: 30.3 ms
Reading and writing Parquet
CPU times: user 495 ms, sys: 84.2 ms, total: 579 ms
Wall time: 10.1 s
CPU times: user 264 ms, sys: 189 ms, total: 453 ms
Wall time: 4.34 s
Reading and writing Feather
CPU times: user 271 ms, sys: 28 ms, total: 299 ms
Wall time: 1.04 s
CPU times: user 123 ms, sys: 59.9 ms, total: 183 ms
Wall time: 130 ms
