In [1]:
import pandas as pd
import numpy as np

In [4]:
def get_dataset(size, seed=None):
    """Build a synthetic DataFrame of random records for the I/O benchmarks.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, all values are drawn from a private
        ``np.random.RandomState(seed)``, so the same seed always produces the
        same frame and NumPy's global random state is left untouched.
        When omitted (the default), values are drawn from NumPy's global
        random state, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Frame with ``size`` rows and the columns, in order:
        ``size`` ('big'/'medium'/'small'), ``age`` (integers in [1, 50)),
        ``team`` ('red'/'blue'/'yellow'/'green'), ``win`` ('yes'/'no'),
        ``date`` (days from 2020-01-01 through 2022-12-31) and
        ``prob`` (floats in [0, 1)).
    """
    # The np.random module and a RandomState instance expose the same
    # choice/randint/uniform methods, so one code path serves both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    df = pd.DataFrame()

    # Column order also fixes the order of random draws; keep it stable so
    # seeded output does not change.
    df['size'] = rng.choice(['big', 'medium', 'small'], size)
    df['age'] = rng.randint(1, 50, size)
    df['team'] = rng.choice(['red', 'blue', 'yellow', 'green'], size)
    df['win'] = rng.choice(['yes', 'no'], size)
    dates = pd.date_range('2020-01-01', '2022-12-31')
    df['date'] = rng.choice(dates, size)
    df['prob'] = rng.uniform(0, 1, size)

    return df

def set_dtypes(df):
    """Convert the benchmark frame's columns to compact dtypes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``size``, ``team``, ``age``, ``win``
        and ``prob`` (the layout produced by ``get_dataset``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns reassigned:
        ``size``/``team`` as ``category``, ``age`` as ``int16``,
        ``win`` as booleans and ``prob`` as ``float16``.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    df['size'] = df['size'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int16')
    # The yes/no column is named 'win'; the previous 'dq' lookup raised
    # KeyError because no such column is ever created.
    # Values other than 'yes'/'no' are mapped to NaN by Series.map.
    df['win'] = df['win'].map({'yes': True, 'no': False})
    # NOTE(review): float16 may not be accepted by every on-disk writer
    # (e.g. Parquet/Feather, depending on the engine version) - confirm
    # before using this frame in those benchmarks.
    df['prob'] = df['prob'].astype('float16')

    return df

In [11]:
%%timeit
# Benchmark: write a freshly generated 1,000,000-row frame to CSV,
# including the index as a column.
# NOTE(review): get_dataset() runs inside the timed cell body, so the
# reported time covers data generation as well as the to_csv call.
df = get_dataset(1_000_000)
df.to_csv('test_csv.csv', index=True)

3.49 s ± 58.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%%timeit
# Benchmark: load 'test_csv.csv' back into a frame, using the file's first
# column as the index. The file must already exist on disk.
df = pd.read_csv('test_csv.csv', index_col=[0])

632 ms ± 62.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit
# Benchmark: write a freshly generated 1,000,000-row frame as a pickle.
# NOTE(review): get_dataset() runs inside the timed cell body, so the
# reported time covers data generation as well as the to_pickle call.
df = get_dataset(1_000_000)
df.to_pickle('test_pickle.pickle')

1.43 s ± 107 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit
# Benchmark: load the pickled frame from 'test_pickle.pickle'.
# Unpickling can execute arbitrary code, so only read pickle files from a
# trusted source.
df = pd.read_pickle('test_pickle.pickle')

272 ms ± 14.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%%timeit 
# Benchmark: write a freshly generated 1,000,000-row frame as Parquet.
# NOTE(review): get_dataset() runs inside the timed cell body, so the
# reported time covers data generation as well as the to_parquet call.
df = get_dataset(1_000_000)
df.to_parquet('test_parquet.parquet')

823 ms ± 19.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit
# Benchmark: load 'test_parquet.parquet'; the resulting frame is not
# assigned to a name, only the read itself is timed.
pd.read_parquet('test_parquet.parquet')

133 ms ± 7.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit
# Benchmark: write a freshly generated 1,000,000-row frame as Feather.
# NOTE(review): get_dataset() runs inside the timed cell body, so the
# reported time covers data generation as well as the to_feather call.
df = get_dataset(1_000_000)
df.to_feather('test_feather.feather')

622 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit
# Benchmark: load 'test_feather.feather'; the resulting frame is not
# assigned to a name, only the read itself is timed.
pd.read_feather('test_feather.feather')

109 ms ± 3.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
