In [4]:
import pandas as pd
import numpy as np

# I/O formats to benchmark. Each name is appended to 'to_' and 'read_' to look
# up the matching DataFrame writer method and pandas reader function.
FORMATS = ['clipboard', 'csv', 'feather', 'json', 'parquet', 'pickle', 'stata']

def get_dataset(size, seed=None):
    """Build a synthetic DataFrame with ``size`` rows for the I/O benchmark.

    Columns, in order: ``size``, ``age``, ``team``, ``win``, ``date``, ``prob``.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int or None, optional
        Seed for the random generator. With the default ``None`` every call
        draws fresh random data; pass an integer to get the same frame on
        every run.

    Returns
    -------
    pandas.DataFrame
        Frame with a default integer index and one randomly drawn value per
        row in each column.
    """
    rng = np.random.default_rng(seed)
    # Plain datetime64 array so the generator samples from it directly.
    dates = pd.date_range('2020-01-01', '2022-12-31').to_numpy()
    # Build every column in one constructor call; dict order fixes column order.
    return pd.DataFrame({
        'size': rng.choice(['big', 'medium', 'small'], size),
        'age': rng.integers(1, 50, size),  # upper bound is exclusive: 1..49
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'date': rng.choice(dates, size),
        'prob': rng.uniform(0, 1, size),
    })

def set_dtypes(df):
    """Convert the benchmark columns of ``df`` to compact dtypes.

    ``size`` and ``team`` become ``category``, ``age`` becomes ``int16``,
    ``prob`` becomes ``float32`` and ``win`` is mapped from ``'yes'``/``'no'``
    to ``True``/``False``.

    The frame is modified in place and also returned, so both
    ``set_dtypes(df)`` and ``df = set_dtypes(df)`` work. Calling the function
    again on an already converted frame leaves it unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``size``, ``team``, ``age``, ``win`` and ``prob`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    casts = {'size': 'category', 'team': 'category', 'age': 'int16', 'prob': 'float32'}
    for column, dtype in casts.items():
        df[column] = df[column].astype(dtype)
    # Mapping an already-boolean column through the yes/no dict would turn
    # every value into NaN, so skip it when the column was converted before.
    if not pd.api.types.is_bool_dtype(df['win']):
        df['win'] = df['win'].map({'yes': True, 'no': False})
    return df

In [7]:
for dataset_size, size in [['small', 1e2], ['medium', 1e4], ['large', 1e6], ['huge', 1e8]]:
    size = int(size)
    print(f"\n---===Testing {dataset_size} dataset, {size} rows===---")
    df = get_dataset(size)
    df = set_dtypes(df)
    for func in FORMATS:
        print('\t', func, 'writing')
        %time getattr(df, 'to_'+func)('dummy_file_name')
        print('\t', func, 'reading')
        %time getattr(pd, 'read_'+func)('dummy_file_name')



---===Testing small dataset, 100 rows===---
	 clipboard writing
CPU times: user 0 ns, sys: 11.4 ms, total: 11.4 ms
Wall time: 22.7 ms
	 clipboard reading
CPU times: user 0 ns, sys: 11.5 ms, total: 11.5 ms
Wall time: 24.8 ms
	 csv writing
CPU times: user 0 ns, sys: 3.39 ms, total: 3.39 ms
Wall time: 3.25 ms
	 csv reading
CPU times: user 1.33 ms, sys: 191 µs, total: 1.52 ms
Wall time: 1.47 ms
	 feather writing
CPU times: user 0 ns, sys: 2.72 ms, total: 2.72 ms
Wall time: 1.83 ms
	 feather reading
CPU times: user 2.21 ms, sys: 515 µs, total: 2.72 ms
Wall time: 2.07 ms
	 json writing
CPU times: user 728 µs, sys: 20 µs, total: 748 µs
Wall time: 661 µs
	 json reading
CPU times: user 5.1 ms, sys: 30 µs, total: 5.13 ms
Wall time: 5.1 ms
	 parquet writing
CPU times: user 1.65 ms, sys: 0 ns, total: 1.65 ms
Wall time: 1.6 ms
	 parquet reading
CPU times: user 3.57 ms, sys: 47 µs, total: 3.62 ms
Wall time: 2.41 ms
	 pickle writing
CPU times: user 722 µs, sys: 0 ns, total: 722 µs
Wall time: 649 µs


xsel: realloc error: Cannot allocate memory


CPU times: user 7min 27s, sys: 3.72 s, total: 7min 30s
Wall time: 7min 31s
	 clipboard reading
CPU times: user 3.66 ms, sys: 7.99 ms, total: 11.6 ms
Wall time: 22.5 ms
	 csv writing
CPU times: user 7min 26s, sys: 2.88 s, total: 7min 29s
Wall time: 7min 32s
	 csv reading
CPU times: user 31.7 s, sys: 2.64 s, total: 34.4 s
Wall time: 34.4 s
	 feather writing
CPU times: user 7.5 s, sys: 1.64 s, total: 9.14 s
Wall time: 5.29 s
	 feather reading
CPU times: user 1.62 s, sys: 917 ms, total: 2.53 s
Wall time: 1.09 s
	 json writing
CPU times: user 51.6 s, sys: 13.8 s, total: 1min 5s
Wall time: 1min 13s
	 json reading


ValueError: Could not reserve memory block

	 parquet writing
CPU times: user 6.21 s, sys: 1.03 s, total: 7.24 s
Wall time: 7.55 s
	 parquet reading
CPU times: user 4.58 s, sys: 1.66 s, total: 6.23 s
Wall time: 2.05 s
	 pickle writing
CPU times: user 120 ms, sys: 1.53 s, total: 1.65 s
Wall time: 3.72 s
	 pickle reading
CPU times: user 3.2 ms, sys: 545 ms, total: 548 ms
Wall time: 561 ms
	 stata writing
CPU times: user 4.57 s, sys: 2.65 s, total: 7.22 s
Wall time: 10.2 s
	 stata reading
CPU times: user 5.02 s, sys: 2.66 s, total: 7.69 s
Wall time: 7.71 s


## SUMMARY

### Pickle is the fastest format to write and read in the timings above; file size and portability were not measured